{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import re\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from unidecode import unidecode\n",
    "from utils import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textcleaning(string):\n",
    "    \"\"\"Normalise one raw text into lowercase, space-separated word tokens.\n",
    "\n",
    "    Steps, in order:\n",
    "      1. drop whitespace-separated tokens that contain '#' or '@',\n",
    "      2. strip URLs (anything starting with 'http' or a literal 'www.'),\n",
    "      3. transliterate with unidecode and pad '.' and ',' with a trailing space,\n",
    "      4. replace every run of characters other than letters, quotes, '-' and\n",
    "         space with a single space,\n",
    "      5. keep only tokens longer than one character, join them with single\n",
    "         spaces and lowercase the result.\n",
    "\n",
    "    Args:\n",
    "        string: the raw text to clean.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned, lowercased text.\n",
    "    \"\"\"\n",
    "    kept = [tok for tok in string.split() if '#' not in tok and '@' not in tok]\n",
    "    # Raw strings avoid invalid-escape warnings; the dot is escaped so that only a\n",
    "    # literal 'www.' prefix counts as a URL (not any word starting with 'www').\n",
    "    string = re.sub(r'http\\S+|www\\.\\S+', '', ' '.join(kept))\n",
    "    string = unidecode(string).replace('.', '. ').replace(',', ', ')\n",
    "    string = re.sub(r'''[^'\"A-Za-z\\- ]+''', ' ', string)\n",
    "    # Every punctuation match is a single character, so the length filter below\n",
    "    # discards it together with one-letter words.\n",
    "    tokens = re.findall(r'''[\\w']+|[;:\\-\\(\\)&.,!?\"]''', string)\n",
    "    return ' '.join(tok for tok in tokens if len(tok) > 1).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40911, 7)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the dataset and drop every row that has a missing value.\n",
    "df = pd.read_csv('toxic-bm.csv').dropna()\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the text column (column 0) with one positional column assignment\n",
    "# instead of a separate .iloc read and write for every row.\n",
    "df.iloc[:, 0] = [textcleaning(text) for text in df.iloc[:, 0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 65530\n",
      "Most common words [('yang', 103249), ('anda', 68130), ('dan', 59109), ('tidak', 54237), ('untuk', 50427), ('di', 36812)]\n",
      "Sample data [455, 91, 199, 4, 242, 9, 106, 835, 3243, 8165] ['penjelasan', 'mengapa', 'pengeditan', 'yang', 'dibuat', 'di', 'bawah', 'peminat', 'tegar', 'metallica']\n"
     ]
    }
   ],
   "source": [
    "# Cleaned texts are taken from the first column of df.\n",
    "texts = df.iloc[:, 0].tolist()\n",
    "# One flat token stream over the whole corpus.\n",
    "concat = [token for text in texts for token in text.split()]\n",
    "vocabulary_size = len(set(concat))\n",
    "# build_dataset is presumably provided by the star-import of utils; its\n",
    "# behaviour is not visible in this notebook.\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d' % vocabulary_size)\n",
    "print('Most common words', count[4:10])\n",
    "sample_ids = data[:10]\n",
    "print('Sample data', sample_ids, [rev_dictionary[idx] for idx in sample_ids])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def str_idx(corpus, dic, maxlen, UNK=3):\n",
    "    X = np.zeros((len(corpus),maxlen))\n",
    "    for i in range(len(corpus)):\n",
    "        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):\n",
    "            try:\n",
    "                X[i,-1 - no]=dic[k]\n",
    "            except Exception as e:\n",
    "                X[i,-1 - no]=UNK\n",
    "    return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40911, 6)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label columns, in the order used for the columns of Y.\n",
    "list_classes = [\n",
    "    \"toxic\", \"severe_toxic\", \"obscene\",\n",
    "    \"threat\", \"insult\", \"identity_hate\",\n",
    "]\n",
    "Y = df[list_classes].values\n",
    "Y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every text as a fixed-width row of 200 token ids.\n",
    "vectors = str_idx(corpus = texts, dic = dictionary, maxlen = 200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed so the same rows land in the train and test partitions on every run.\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(vectors, Y, test_size = 0.2, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.optimizers import Adam, RMSprop\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler\n",
    "from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D\n",
    "from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU\n",
    "from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten\n",
    "from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D\n",
    "from keras.models import Model, load_model\n",
    "from keras import initializers, regularizers, constraints, optimizers, layers, callbacks\n",
    "from keras import backend as K\n",
    "from keras.engine import InputSpec, Layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path that ModelCheckpoint writes to whenever val_loss improves.\n",
    "file_path = \"best_model.hdf5\"\n",
    "# Stop training after 5 epochs without a lower val_loss.\n",
    "early_stop = EarlyStopping(monitor=\"val_loss\", mode=\"min\", patience=5)\n",
    "check_point = ModelCheckpoint(\n",
    "    file_path,\n",
    "    monitor=\"val_loss\",\n",
    "    mode=\"min\",\n",
    "    save_best_only=True,\n",
    "    verbose=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 32728 samples, validate on 8183 samples\n",
      "Epoch 1/10\n",
      "32728/32728 [==============================] - 175s 5ms/step - loss: 0.2176 - acc: 0.9541 - val_loss: 0.1432 - val_acc: 0.9627\n",
      "\n",
      "Epoch 00001: val_loss improved from inf to 0.14315, saving model to best_model.hdf5\n",
      "Epoch 2/10\n",
      "32728/32728 [==============================] - 174s 5ms/step - loss: 0.1390 - acc: 0.9628 - val_loss: 0.1327 - val_acc: 0.9628\n",
      "\n",
      "Epoch 00002: val_loss improved from 0.14315 to 0.13274, saving model to best_model.hdf5\n",
      "Epoch 3/10\n",
      "32728/32728 [==============================] - 173s 5ms/step - loss: 0.0985 - acc: 0.9688 - val_loss: 0.0832 - val_acc: 0.9733\n",
      "\n",
      "Epoch 00003: val_loss improved from 0.13274 to 0.08320, saving model to best_model.hdf5\n",
      "Epoch 4/10\n",
      "32728/32728 [==============================] - 181s 6ms/step - loss: 0.0654 - acc: 0.9773 - val_loss: 0.0725 - val_acc: 0.9758\n",
      "\n",
      "Epoch 00004: val_loss improved from 0.08320 to 0.07247, saving model to best_model.hdf5\n",
      "Epoch 5/10\n",
      "32728/32728 [==============================] - 179s 5ms/step - loss: 0.0567 - acc: 0.9798 - val_loss: 0.0690 - val_acc: 0.9764\n",
      "\n",
      "Epoch 00005: val_loss improved from 0.07247 to 0.06903, saving model to best_model.hdf5\n",
      "Epoch 6/10\n",
      "32728/32728 [==============================] - 174s 5ms/step - loss: 0.0505 - acc: 0.9814 - val_loss: 0.0698 - val_acc: 0.9766\n",
      "\n",
      "Epoch 00006: val_loss did not improve from 0.06903\n",
      "Epoch 7/10\n",
      "32728/32728 [==============================] - 265s 8ms/step - loss: 0.0460 - acc: 0.9829 - val_loss: 0.0683 - val_acc: 0.9765\n",
      "\n",
      "Epoch 00007: val_loss improved from 0.06903 to 0.06827, saving model to best_model.hdf5\n",
      "Epoch 8/10\n",
      "32728/32728 [==============================] - 208s 6ms/step - loss: 0.0426 - acc: 0.9841 - val_loss: 0.0701 - val_acc: 0.9765\n",
      "\n",
      "Epoch 00008: val_loss did not improve from 0.06827\n",
      "Epoch 9/10\n",
      "32728/32728 [==============================] - 181s 6ms/step - loss: 0.0394 - acc: 0.9854 - val_loss: 0.0756 - val_acc: 0.9766\n",
      "\n",
      "Epoch 00009: val_loss did not improve from 0.06827\n",
      "Epoch 10/10\n",
      "32728/32728 [==============================] - 234s 7ms/step - loss: 0.0369 - acc: 0.9863 - val_loss: 0.0778 - val_acc: 0.9765\n",
      "\n",
      "Epoch 00010: val_loss did not improve from 0.06827\n"
     ]
    }
   ],
   "source": [
    "# Input: sequences of token ids with no fixed length.\n",
    "inp = Input(shape = (None,))\n",
    "# Trainable 256-dimensional embedding with one row per dictionary entry.\n",
    "x = Embedding(len(dictionary), 256, trainable=True)(inp)\n",
    "x1 = SpatialDropout1D(0.2)(x)\n",
    "\n",
    "# Branch 1: bidirectional GRU over the embedded sequence, then a width-2 convolution.\n",
    "x = Bidirectional(GRU(128, return_sequences = True))(x1)\n",
    "x = Conv1D(64, kernel_size = 2, padding = \"valid\", kernel_initializer = \"he_uniform\")(x)\n",
    "    \n",
    "# Branch 2: same layout with an LSTM, fed from the same dropout output x1.\n",
    "y = Bidirectional(LSTM(128, return_sequences = True))(x1)\n",
    "y = Conv1D(64, kernel_size = 2, padding = \"valid\", kernel_initializer = \"he_uniform\")(y)\n",
    "    \n",
    "# Collapse the time axis of each branch with both average and max pooling.\n",
    "avg_pool1 = GlobalAveragePooling1D()(x)\n",
    "max_pool1 = GlobalMaxPooling1D()(x)\n",
    "    \n",
    "avg_pool2 = GlobalAveragePooling1D()(y)\n",
    "max_pool2 = GlobalMaxPooling1D()(y)\n",
    "    \n",
    "x = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])\n",
    "\n",
    "# Six independent sigmoid outputs, trained with binary cross-entropy.\n",
    "x = Dense(6, activation = \"sigmoid\")(x)\n",
    "model = Model(inputs = inp, outputs = x)\n",
    "model.compile(loss = \"binary_crossentropy\", optimizer = Adam(lr = 1e-4), metrics = [\"accuracy\"])\n",
    "# NOTE(review): validation_data is the (test_X, test_Y) split, which also drives\n",
    "# check_point and early_stop; the checkpointed model is therefore selected on\n",
    "# the same data the notebook later reports metrics on.\n",
    "history = model.fit(train_X, train_Y, batch_size = 128, epochs = 10, validation_data = (test_X, test_Y), \n",
    "                    verbose = 1, callbacks = [check_point, early_stop])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the in-memory model with the checkpoint stored at file_path.\n",
    "model = load_model(filepath = file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8183/8183 [==============================] - 16s 2ms/step\n"
     ]
    }
   ],
   "source": [
    "# Per-label scores for every row of the held-out split.\n",
    "predicted = model.predict(test_X, verbose = 1, batch_size = 128)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round each score to the nearest integer to obtain hard 0/1 labels.\n",
    "around_predicted = predicted.round()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.78      0.63      0.69       817\n",
      "          1       0.58      0.19      0.28        81\n",
      "          2       0.79      0.61      0.69       427\n",
      "          3       0.00      0.00      0.00        30\n",
      "          4       0.72      0.50      0.59       398\n",
      "          5       0.00      0.00      0.00        80\n",
      "\n",
      "avg / total       0.71      0.54      0.61      1833\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "# Label the report rows with the class names instead of bare column indices.\n",
    "print(metrics.classification_report(test_Y, around_predicted, target_names = list_classes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.94984066, 0.7050257 , 0.88807845, 0.40766585, 0.8686688 ,\n",
       "        0.55271745]], dtype=float32)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode one sample sentence into a 7-column id row and score it.\n",
    "sample_vectors = str_idx(['bodoh lah anti sosial'], dictionary, 7)\n",
    "model.predict(sample_vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "# Write both lookup tables to a single JSON file.\n",
    "vocab_payload = {'dictionary':dictionary,'reverse_dictionary':rev_dictionary}\n",
    "with open('fast-text-toxic.json','w') as fopen:\n",
    "    json.dump(vocab_payload, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
